# Global parameters
# Smoothing parameter for the ComplementNB classifier.
alpha = 0.2
# Number of folds for the stratified cross-validation.
nfold = 10
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from lime import lime_text
import unicodedata
# Load the dataset
# Load the annotated corpus and drop rows missing a label or a text.
df = pd.read_excel('./politica.xlsx', sheet_name="Foglio1")
print(f'Found {len(df)} texts.')
print(f'{df["cap_maj_master"].isnull().sum()} document(s) with no classification removed')
df = df[pd.notnull(df['cap_maj_master'])]
print(f'{df["testo"].isnull().sum()} document(s) with no text removed')
df = df[pd.notnull(df['testo'])]

# Target labels and raw documents as plain Python lists.
classes = [int(c) for c in df['cap_maj_master']]
documents = list(df['testo'])

# Bar chart of the class distribution.
y = np.bincount(classes)
x = np.arange(len(y))
fig, ax = plt.subplots()
plt.bar(x, y, width=0.7)
ax.set_xticks(x)
ax.set_aspect('auto')
plt.show()
# Preprocessing helpers
def preprocessor(text):
    """Normalize a raw document before vectorization.

    Strips HTML tags, digit runs and two stray mis-encoded characters,
    lower-cases the text, collapses every non-word run into a single
    space and re-appends any emoticons found (with their '-' nose removed)
    so they survive the cleanup.

    :param text: raw document (coerced to ``str``).
    :returns: cleaned, lower-cased string.
    """
    # Raw strings for every regex: '\d' / '\W' in plain strings are
    # invalid escape sequences (SyntaxWarning on modern Python).
    text = str(text)
    text = re.sub(r'<[^>]*>', ' ', text)   # drop HTML tags
    text = re.sub(r'\d+', ' ', text)       # drop digit runs
    text = re.sub('[ᆱᄏ]', '', text)        # drop mojibake characters
    # Capture emoticons such as :-) ;( =D before \W+ would destroy them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' +
            ' '.join(emoticons).replace('-', ''))
    return text
def strip_accents(text):
    """
    Strip accents from input String.

    NFD decomposition splits e.g. 'à' into 'a' plus a combining grave
    accent; the ascii/ignore encode round-trip then discards the
    combining marks, leaving the base letters.

    :param text: The input string.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    # The old Python-2 `unicode(text, 'utf-8')` shim was dead code here:
    # the file uses f-strings, so it only runs on Python 3.
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    return text.decode('utf-8')
# Italian stopword set shared by the vectorizers below (requires the
# NLTK 'stopwords' corpus to be downloaded).
stop=set(stopwords.words('italian'))
# Created lazily on first use so the stemmer is built once instead of
# once per document (the original rebuilt it on every call).
_porter_stemmer = None

def tokenizer_porter(text):
    """Tokenize *text* and stem each token with the Italian Snowball stemmer.

    :param text: document to tokenize.
    :returns: list of stemmed tokens.
    """
    global _porter_stemmer
    if _porter_stemmer is None:
        _porter_stemmer = SnowballStemmer("italian", ignore_stopwords=True)
    word_tokens = word_tokenize(text)
    return [_porter_stemmer.stem(word) for word in word_tokens]
def tokenizer(text):
    """Tokenize *text*, dropping Italian stopwords and short tokens.

    :param text: document to tokenize.
    :returns: tokens longer than 3 characters that are not stopwords.
    """
    # Reuse the module-level `stop` set instead of rebuilding it on every
    # call, and filter in a single pass (`w not in` is the idiomatic form).
    word_tokens = word_tokenize(text)
    return [w for w in word_tokens if w not in stop and len(w) > 3]
# Create vocabulary
# Tf-idf document-term matrix over the full corpus vocabulary.
tfidf = TfidfVectorizer(strip_accents=strip_accents,
                        lowercase=False,
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_porter,
                        stop_words=stop,
                        min_df=4
                        )
final_features = tfidf.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; use the
# replacement API.
X_names = tfidf.get_feature_names_out()
# Chi-squared scorer used per class below; k="all" keeps every feature
# so selection is done by thresholding p-values, not a fixed count.
x_best_s = SelectKBest(chi2, k="all")
p_value_limit = 0.95
# Visual sparsity check on a random sample of 100 columns.
sns.heatmap(final_features.todense()[:, np.random.randint(0, final_features.shape[1], 100)] == 0,
            vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')
# Per-class chi-squared feature selection: a feature is kept if, for at
# least one class, (1 - p-value) exceeds p_value_limit.
per_class_frames = []
for cat in np.unique(classes):
    # One-vs-rest binary target for this class. The original computed
    # .astype(int) but discarded the result; here it is actually used.
    appoggio = (cat == classes).astype(int)
    x_best_s.fit(final_features, appoggio)
    p = x_best_s.pvalues_
    per_class_frames.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p, "y": cat}))
# DataFrame.append was removed in pandas 2.0; concatenate instead.
dtf_features = pd.concat(per_class_frames, ignore_index=True)
dtf_features = dtf_features.sort_values(["y", "score"],
                                        ascending=[True, False])
dtf_features = dtf_features[dtf_features["score"] > p_value_limit]
X_names = dtf_features["feature"].unique().tolist()
# Report, for each class, how many features survived selection and the
# ten highest-scoring ones.
for cat in np.unique(classes):
    subset = dtf_features[dtf_features["y"] == cat]
    print("# {}:".format(cat))
    print(" . selected features:", len(subset))
    print(" . top features:", ",".join(subset["feature"].values[:10]))
    print(" ")
# Re-vectorize the corpus, restricting the vocabulary to the features
# selected by the chi-squared procedure above.
tfidf = TfidfVectorizer(strip_accents=strip_accents,
                        lowercase=False,
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_porter,
                        stop_words=stop,
                        min_df=4,
                        vocabulary=X_names)
final_features = tfidf.fit_transform(documents)
# Sparsity check of the reduced matrix on a random sample of 100 columns.
sample_cols = np.random.randint(0, final_features.shape[1], 100)
sns.heatmap(final_features.todense()[:, sample_cols] == 0,
            vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')
print(final_features.toarray().shape)
# Classification pipeline: selected-vocabulary tf-idf followed by a
# Complement Naive Bayes classifier.
pipe_lr = make_pipeline(tfidf, ComplementNB(alpha=alpha))

X = np.array(documents)
y = np.array(classes)
# Generator of (train, test) index pairs, stratified by class.
kfold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=1).split(X, y)
accuracys = []
scores = []
target_names = [str(c) for c in np.unique(classes)]
# Cross-validation loop: for each fold, fit the pipeline on the training
# split, score the test split, print a per-fold report and plot the
# confusion matrix, ROC and precision-recall curves, then show a LIME
# explanation for one test document.
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X[train], y[train])
    y_pred=pipe_lr.predict(X[test])
    y_prob=pipe_lr.predict_proba(X[test])
    # One-hot encoding of the true labels, needed for the per-class
    # ROC and precision-recall curves below.
    y_test_array = pd.get_dummies(y[test], drop_first=False).values
    ## compute accuracy
    accuracy = pipe_lr.score(X[test], y[test])
    accuracys.append(accuracy)
    ## compute precision, recall, f-score (weighted by class support)
    auc = metrics.roc_auc_score(y[test], y_prob, multi_class="ovr")
    score=precision_recall_fscore_support(y_true=y[test], y_pred=y_pred, average="weighted")
    scores.append(score[0:3])
    print('--------------- Fold: %2d ---------------------'% (k+1))
    print()
    print("Accuracy:", round(accuracy,2))
    print("Auc:", round(auc,2))
    print("Detail:")
    print(metrics.classification_report(y[test], y_pred))
    ## Plot confusion matrix
    conf_mat = confusion_matrix(y[test], y_pred)
    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax, cbar=False,cmap=plt.cm.Blues)
    ax.set(xlabel="Predicted", ylabel="Actual", xticklabels=target_names,
           yticklabels=target_names, title="Confusion matrix")
    plt.yticks(rotation=0)
    print()
    ## Plot Roc (one-vs-rest curve per class)
    fig, ax = plt.subplots(figsize=(10,10))
    for i in range(len(target_names)):
        fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],
                                                 y_prob[:,i])
        ax.plot(fpr, tpr, lw=3,
                label='{0} (area={1:0.2f})'.format(target_names[i],
                                                   metrics.auc(fpr, tpr))
                )
    ax.plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
    ax.set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
           xlabel='False Positive Rate',
           ylabel="True Positive Rate (Recall)",
           title="Receiver operating characteristic")
    ax.legend(loc="lower right")
    ax.grid(True)
    ## Plot precision-recall curve (one-vs-rest curve per class)
    fig, ax = plt.subplots(figsize=(10,10))
    for i in range(len(target_names)):
        precision, recall, thresholds = metrics.precision_recall_curve(
            y_test_array[:,i], y_prob[:,i])
        ax.plot(recall, precision, lw=3,
                label='{0} (area={1:0.2f})'.format(target_names[i],
                                                   metrics.auc(recall, precision))
                )
    ax.set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
           ylabel="Precision", title="Precision-Recall curve")
    ax.legend(loc="best")
    ax.grid(True)
    plt.show()
    ## select observation (first test document of this fold)
    i = 0
    txt_instance = X[test][i]
    ## check true value and predicted value
    print("True:", y[test][i], "--> Pred:", y_pred[i], "| Prob:", round(np.max(y_prob[i]),2))
    ## show explanation (LIME word-level explanation of this prediction)
    explainer = lime_text.LimeTextExplainer(class_names=target_names)
    explained = explainer.explain_instance(txt_instance,
                                           pipe_lr.predict_proba, num_features=6,top_labels=2)
    explained.show_in_notebook(text=txt_instance, predict_proba=False)
# Aggregate cross-validation metrics: mean +/- standard deviation over
# the folds, expressed as percentages.
arr = np.array(scores)

def _pct(values):
    # Mean and standard deviation scaled to percentages.
    return np.mean(values) * 100, np.std(values) * 100

print("Overall results of the cross-validation procedure")
print()
print('\nCV accuracy: %.1f +/- %.1f' % _pct(accuracys))
print('\nCV precision: %.1f +/- %.1f' % _pct(arr[:, 0]))
print('\nCV recall: %.1f +/- %.1f' % _pct(arr[:, 1]))
print('\nCV f1: %.1f +/- %.1f' % _pct(arr[:, 2]))